********************************************************************************
** Reset the session: close any log that is still open, empty memory and
** switch off output paging
capture log close
clear all
set more off

********************************************************************************
** Folder locations (relative paths)
** Run this do-file with the working directory set to the folder that contains it

global temp "../temp"
global input "../input"
global output "../output"

********************************************************************************
** Open the log for this run in the temp folder; an existing log is overwritten
log using "$temp/get_data", replace

********************************************************************************
** Randomization groups: read the CSV (first row holds the variable names, their
** capitalization is kept), rename the page identifier to vPage (the key used when
** this file is merged back in) and store the result in the temp folder
import delimited using "$input/randomization_group.csv", clear varnames(1) case(preserve)
rename page vPage
compress
save "$temp/randomization", replace

********************************************************************************
** Treatment assignment: read the CSV (first row holds the variable names), print
** the variable list to the log and store the result in the temp folder
import delimited using "$input/randomization_treatment.csv", clear varnames(1)
describe
compress
save "$temp/treatment", replace

********************************************************************************
** Page length
********************************************************************************
** Load the page-length records; the first CSV row holds the variable names
import delimited using "$input/pagelength_images.csv", clear varnames(1)

** Observation counts by language
tabulate lang

** Attach the treatment file by language and page
** nogenerate: the _merge marker is not created, so there is nothing to drop
merge m:1 lang page using "$temp/treatment", nogenerate

** Convert the string date (year-month-day) into a monthly date
generate vMonth = mofd(date(date, "YMD"))
format %tm vMonth
** Show the first 10 rows to check the conversion, then discard the string date
list vMonth date in 1/10
drop date

** vPageID: string key built as "<WikiData item id>-<language>"
generate vPageID = page + "-" + lang

** vNumPageID: numeric id with one value per distinct vPageID (240 page ids)
egen vNumPageID = group(vPageID)

summarize vNumPageID
** Check that page id and month identify the observations
duplicates report vMonth vNumPageID

** Build a balanced panel
** The original dataset has no observation when page length is 0. tsfill, full
** adds a row for every page id in every month of the overall sample range;
** the added rows carry missing values (the zeros are filled in afterwards)
xtset vNumPageID vMonth
tsfill, full
tabulate vMonth

** vTreatmentGroup: indicator whether the city-lang combination was assigned to the treatment group
** It is the within-page mean of treatment, hence constant over time (for
** treatment group pages it equals one both before and after treatment)
egen vTreatmentGroup = mean(treatment), by(vNumPageID)
tabulate vMonth vTreatmentGroup

** vPage: Wikidata item ID (60 items); vLanguage: language of the page
** The within-page mode copies the string value to every row of the page
egen vPage = mode(page), by(vNumPageID)
tabulate vPage
egen vLanguage = mode(lang), by(vNumPageID)
tabulate vLanguage

** Missing length measures are set to 0
foreach measure of varlist length plain images {
	replace `measure' = 0 if `measure' == .
}

** Page-by-language tables, treatment group first, then control group
foreach group in 1 0 {
	tabulate vPage vLanguage if vTreatmentGroup == `group'
}

keep vNumPageID vMonth vPage vLanguage vTreatmentGroup length plain images

compress

label variable vMonth "Months"

summarize vNumPageID vMonth vTreatmentGroup length plain images

save "$temp/pagelength", replace

********************************************************************************
** Note: the variables users/edits do not include experimental users/edits
********************************************************************************
** Load user and edit counts; the first CSV row holds the variable names
import delimited using "$input/users_edits.csv", clear varnames(1)

** Convert the string date (year-month-day) into a monthly date and show the
** first 10 rows to check the conversion
generate vMonth = mofd(date(date, "YMD"))
format %tm vMonth
list vMonth date in 1/10

tabulate vMonth

** Sep 2018 covers only half a month, therefore it is dropped
drop if vMonth == ym(2018, 9)

** Project naming: identifiers and the registered / anonymous user counts
rename (page lang user_edit_users anonymous_edit_users) (vPage vLanguage vUsersReg vUsersAnon)

** Total users = registered + anonymous
generate vUsers = vUsersReg + vUsersAnon

** Note: vEditDays is what we call edits in the text (edits collapsed to daily level like in Aaltonen-Seiler)
generate vEditDays = user_edit_days + anonymous_edit_days

** Edits that don't touch our treatment text
generate vEditsNotOurText = anonymous_edit_days_nontt + user_edit_days_nontt

keep vMonth vPage vLanguage vUsers* vEditDays vEditsNotOurText

compress

save "$temp/users", replace

********************************************************************************
** Edit distance
** Measured at character-level granularity
********************************************************************************
** Load the edit records; the first CSV row holds the variable names
import delimited using "$input/editdistance_character_minoredits_numericuserid.csv", clear varnames(1)

** Convert the string date (year-month-day) into a monthly date and show the
** first 10 rows to check the conversion
generate vMonth = mofd(date(date, "YMD"))
format %tm vMonth
list vMonth date in 1/10

tabulate vMonth

** Drop everything from Sep 2018 onwards: later months are not used, and
** Sep 2018 itself covers only half a month
drop if vMonth >= tm(2018m9)

rename (page lang) (vPage vLanguage)
tabulate vLanguage

** User types overall and by month
tabulate usertype
tabulate vMonth usertype

** Measure how much the experiment added
** For each page-language pair: inserted minus deleted, summed over the edits of
** experimental users (usertype "e"). Dutch ("nl") rows are excluded.
** The experiment is expected to have added text only in Aug 2014; the tabulate
** of vMonth below shows in the log which months actually occur.
preserve
keep if usertype == "e" & vLanguage != "nl"
summarize inserted deleted
generate vTreatmentAdded = inserted - deleted
collapse (sum) vTreatmentAdded, by(vMonth vPage vLanguage)
tabulate vMonth
drop vMonth
** The saved file is merged m:1 on vPage vLanguage later in this do-file, which
** needs exactly one row per page-language pair. Stop here, where the cause is
** visible, if experimental edits span more than one month for some pair.
** missok: only duplicates are checked, empty key values do not raise an error
isid vPage vLanguage, missok
summarize vTreatmentAdded, detail
save "$temp/temp_treatment_added_distance", replace
restore

** The main edit distance variables exclude experimental users
keep if usertype != "e"

** Minor-edit indicator: defined for registered editors (usertype "r") only,
** because this edit characteristic can only exist for them; missing otherwise
generate vIndMinorEdit = cond(usertype == "r", minor == 1, .)
tabulate minor vIndMinorEdit
tabulate vIndMinorEdit
drop minor

** Characters added and deleted per edit
summarize inserted deleted
rename (inserted deleted) (vEditAdd vEditDel)
** Describe edits that add nothing: first those that also delete nothing, then
** those that only delete
summarize if vEditAdd == 0 & vEditDel==0
summarize if vEditAdd == 0 & vEditDel>0

** Edit distance = characters added + characters deleted
generate vEditDist = vEditAdd + vEditDel

summarize vEditAdd vEditDel vEditDist

** Edit-level file on minor edits before treatment
** Keeps edits by registered users (usertype "r") made before Aug 2014; the
** dataset in memory is put back afterwards
preserve
keep if vMonth < tm(2014m8) & usertype == "r"
keep vMonth date vPage vLanguage numericuserid usertype vIndMinorEdit
rename vIndMinorEdit vIndicator
generate vEditType = "Minor edit"
summarize vMonth
tabulate vLanguage
save "$temp/temp_minor_edits", replace
restore

** Capped edit distance, with the cap applied per user and day
** The data are collapsed in two steps: (1) to user-day level, (2) to monthly level

** Step 1: sum within month, page, language, day and user
** NOTE(review): rows with a missing numericuserid would be pooled into a single
** group per page and day - confirm that the id is always filled in
collapse (sum) vEditAdd* vEditDel* vEditDist* , by(vMonth vPage vLanguage date numericuserid)

** The cap is the 90th percentile of the user-day edit distance
summarize vEditDist, detail
display "p90 = `r(p90)'"
local lDailyCap = `r(p90)'
display "Median = `r(p50)'"
generate vEditDistDailyCap = min(vEditDist, `lDailyCap')

** User-day level file with additions and deletions before treatment (before
** Aug 2014); the dataset in memory is put back afterwards
preserve
keep if vMonth < tm(2014m8)
tabulate vMonth
keep vMonth date vPage vLanguage numericuserid vEditAdd vEditDel
duplicates report vMonth date vPage vLanguage numericuserid
save "$temp/temp_delete_add_edits", replace
restore

** Step 2: sum within month, page and language
** vEditDist* now also covers the capped measure vEditDistDailyCap
collapse (sum) vEditAdd* vEditDel* vEditDist* , by(vMonth vPage vLanguage)

summarize vEditAdd* vEditDel* vEditDist*

compress
save "$temp/editdistance", replace

********************************************************************************
** Merge all files together
********************************************************************************
** Start from the page-month panel of page lengths
use "$temp/pagelength", clear

tabulate vMonth

** Add, in this order: monthly users and edits, monthly edit distances (both keyed
** by month, page and language) and the amount added by the experiment (keyed by
** page and language)
** nogenerate: the _merge marker is not created, so there is nothing to drop
merge 1:1 vMonth vPage vLanguage using "$temp/users", keepusing(vUsers vEditDays vEditsNotOurText) nogenerate
merge 1:1 vMonth vPage vLanguage using "$temp/editdistance", keepusing(vEditDist*) nogenerate
merge m:1 vPage vLanguage using "$temp/temp_treatment_added_distance", keepusing(vTreatmentAdded) nogenerate

** Empty values of vEditDays vEditsNotOurText vEditDist* vUsers become 0
** Sep 2018 stays missing (that month was dropped from the users and edit
** distance files)
foreach activity of varlist vEdit* vUsers* {
	replace `activity' = 0 if `activity' == . & vMonth != tm(2018m9)
}

** Page-language pairs without an entry in the treatment-added file get 0
replace vTreatmentAdded = 0 if vTreatmentAdded == .

** Add the randomization groups, keyed by page
merge m:1 vPage using "$temp/randomization", nogenerate

********************************************************************************
** Generate variables
********************************************************************************

** Numeric versions of the Wikidata item id and of the language
encode vPage, generate(vNumPage)
encode vLanguage, generate(vNumLanguage)

** Post-treatment year counter: year y (1-4) runs from Sep (2013 + y) to
** Aug (2014 + y); all other months are 0
generate vGroupPostTrYear = 0
forvalues y = 1/4 {
	local lFirstMonth = ym(2013 + `y', 9)
	local lLastMonth = ym(2014 + `y', 8)
	replace vGroupPostTrYear = `y' if inrange(vMonth, `lFirstMonth', `lLastMonth')
}

** Indicator for the whole post-treatment window, Sep 2014 to Aug 2018
generate vGroupPostTrAll = inrange(vMonth, tm(2014m9), tm(2018m8))
tabulate vGroupPostTrYear vGroupPostTrAll

** Post-treatment yearly averages of the editing measures
** Each average is taken within page and post-treatment year
local lYearlyMeans vAverUsersY vAverEditDaysY vAverEditDistY vAverEditDistDCapY vAverEditsNotTTY
local lMeasures vUsers vEditDays vEditDist vEditDistDailyCap vEditsNotOurText
forvalues i = 1/5 {
	local lMean : word `i' of `lYearlyMeans'
	local lMeasure : word `i' of `lMeasures'
	egen `lMean' = mean(`lMeasure'), by(vGroupPostTrYear vNumPageID)
}

** Want only one observation per page-year: the year-y variable holds the yearly
** average in August of that year (Aug 2015 - Aug 2018) and is missing in every
** other month
foreach lMean of local lYearlyMeans {
	forvalues y = 1/4 {
		generate `lMean'r`y' = `lMean' if vMonth == ym(2014 + `y', 8)
	}
}

** Just checking
list vAverUsersY vAverUsersYr1 if vAverUsersYr1!=.
summarize vAverUsersY vAverUsersYr3 if vAverUsersYr3!=.
summarize vAverEditDistY vAverEditDistYr1 if vAverEditDistYr1!=.
summarize vAverEditDistY vAverEditDistYr4 if vAverEditDistYr4!=.

** The month-level copies of the yearly averages are no longer needed
drop `lYearlyMeans'

********************************************************************************
** Interactions of the treatment group with the post-treatment periods
** vAfterTreatment: every month after Aug 2014
** vAfterTreatment1-4: post-treatment year y only, i.e. the months after
** Aug (2013 + y) up to and including Aug (2014 + y)
generate vAfterTreatment = (vMonth > tm(2014m8)) * vTreatmentGroup
forvalues y = 1/4 {
	local lAfter = ym(2013 + `y', 8)
	local lUntil = ym(2014 + `y', 8)
	generate vAfterTreatment`y' = (vMonth > `lAfter' & vMonth <= `lUntil') * vTreatmentGroup
}
tabulate vMonth vAfterTreatment
tabulate vMonth vAfterTreatment1

** Special treatment group indicator that equals 0 for all Dutch pages and the
** regular treatment group indicator otherwise
generate vTreatmentGroupDutchControl = cond(vLanguage == "nl", 0, vTreatmentGroup)
tabulate vLanguage vTreatmentGroupDutchControl

********************************************************************************
** Natural log of page length; a length of 0 gives a missing value
rename length vLength
generate vLogLength = ln(vLength)

********************************************************************************
** Page length minus treatment (after treatment, subtract what the treatment added)
********************************************************************************
** Treatment took place in Aug 2014 and page length is measured in the beginning
** of the month, so the amount added by the treatment is subtracted from
** Sep 2014 onwards; earlier months keep the raw length
generate vLengthMinusTreatment = cond(vMonth >= tm(2014m9), vLength - vTreatmentAdded, vLength)
tabstat vLengthMinusTreatment vLength, by(vLanguage)
summarize vLengthMinusTreatment if vMonth>=tm(2014m9), detail
summarize vLength if vMonth>=tm(2014m9), detail
generate vLogLengthMinusTreatment = ln(vLengthMinusTreatment)

** Length at the end of post-treatment years 1-4, i.e. as measured in the
** beginning of September 2015-2018; missing in every other month
forvalues y = 1/4 {
	local lYearEnd = ym(2014 + `y', 9)
	generate vLengthMinusTreatmentYr`y' = vLengthMinusTreatment if vMonth == `lYearEnd'
	generate vLogLengthMinusTreatmentYr`y' = vLogLengthMinusTreatment if vMonth == `lYearEnd'
}
tabstat vLogLengthMinusTreatmentYr*, by(vMonth)

********************************************************************************
** Past variables: for length, users, edits, edit distance
********************************************************************************
** Log length in Aug 2014, copied to every month of the page: take the value on
** the Aug 2014 row and spread it with a by-page maximum
generate tempLogLength2014Aug = vLogLength if vMonth==tm(2014m8)
tabstat tempLogLength2014Aug, by(vMonth)
egen vLogLength2014Aug = max(tempLogLength2014Aug), by(vNumPageID)
tabstat tempLogLength2014Aug vLogLength2014Aug, by(vMonth)
drop temp*

** Pre-treatment window: Feb 2010 (all pages exist from then on) to Jul 2014
local lPreTreatmentMonths "inrange(vMonth, tm(2010m2), tm(2014m7))"
summarize vLength vUsers vEditDays vEditDist vEditDistDailyCap if `lPreTreatmentMonths'

** Pre-treatment averages of users, edits and edit distance, by page
** First pass: by-page mean over the window (missing outside the window)
local lMeasures vUsers vEditDays vEditDist vEditDistDailyCap
local lStems Users EditDays EditDist EditDistDCap
foreach lMeasure of local lMeasures {
	egen tempA`lMeasure' = mean(`lMeasure') if `lPreTreatmentMonths', by(vNumPageID)
}

tabstat tempAvUsers, by(vMonth)

** Second pass: spread the window average to every month of the page
forvalues i = 1/4 {
	local lMeasure : word `i' of `lMeasures'
	local lStem : word `i' of `lStems'
	egen vAver`lStem'PreTreatment = max(tempA`lMeasure'), by(vNumPageID)
}

tabstat tempAvUsers vAverUsersPreTreatment, by(vMonth)

** Pages with no edit distance at all before treatment
tabulate vPage vLanguage if vAverEditDistPreTreatment==0

drop temp*

********************************************************************************
** For panel data model
********************************************************************************
** Page age: want to exclude pages with 0 age
** Step 1: first month with positive length, computed on the positive-length rows
egen tempFirstPerPosLength1 = min(vMonth) if vLength>0, by(vNumPageID)
summarize tempFirstPerPosLength1
** Step 2: spread that month to every row of the page
egen tempFirstPerPosLength2 = mean(tempFirstPerPosLength1), by(vNumPageID)
summarize tempFirstPerPosLength*
** Page age equals 1 in the first month with positive length at the beginning of
** the month and grows by 1 per month; negative values are set to 0
generate vPageAge = cond(vMonth - tempFirstPerPosLength2 + 1 < 0, 0, vMonth - tempFirstPerPosLength2 + 1)
tabstat vPageAge, by(vMonth) stat(min mean)
drop temp*

** Balanced sample: months in which the smallest page age is positive, i.e. when
** all pages already exist (starting from Feb 2010)
egen tempBalancedPanel = min(vPageAge), by(vMonth)
generate vBalancedPanel = (tempBalancedPanel>0)
tabulate vMonth vBalancedPanel
drop temp*

********************************************************************************
** Median Page age and length
********************************************************************************
** Above-median indicators for page age and page length
** The median is calculated in Aug 2014 across the non-Dutch languages pooled
** together (not for each language separately). Each indicator is computed on the
** Aug 2014 row and then copied to every month of the page with a by-page mean.
** Stata orders missing values above every number, so a missing age or length
** would pass the ">=" test and be flagged as above the median; the !missing()
** condition leaves the indicator missing for such pages instead.

summarize vPageAge if vMonth == tm(2014m8) & vLanguage!= "nl", detail
display "Median age `r(p50)'"
generate tempAgeAboveMed = (vPageAge>=`r(p50)') if vMonth == tm(2014m8) & vLanguage!= "nl" & !missing(vPageAge)
egen vIndPageAgeAboveMed = mean(tempAgeAboveMed), by(vNumPageID)
summarize tempAgeAboveMed vIndPageAgeAboveMed
drop temp*

summarize vLength if vMonth == tm(2014m8) & vLanguage!= "nl", detail
display "Median length `r(p50)'"
generate tempLengthAboveMed = (vLength>=`r(p50)') if vMonth == tm(2014m8) & vLanguage!= "nl" & !missing(vLength)
egen vIndLengthAboveMed = mean(tempLengthAboveMed), by(vNumPageID)
summarize tempLengthAboveMed vIndLengthAboveMed
drop temp*

********************************************************************************
** Variable labels
** Treatment indicators
label variable vTreatmentGroup "Treatment group"
label variable vTreatmentGroupDutchControl "Treatment group"
label variable vAfterTreatment "Treatment group, post-treatment"

** Pre-treatment covariates
label variable vLogLength2014Aug "Log. length before treatment"
label variable vAverUsersPreTreatment "Aver. \# of users before treatment"
label variable vAverEditDaysPreTreatment "Aver. \# of edits before treatment"
label variable vAverEditDistPreTreatment "Aver. edit dist. before treatment"
label variable vAverEditDistDCapPreTreatment "Aver. capped edit dist. before treatment"

** Year-specific variables: the label carries the ordinal of the post-treatment year
local lOrdinals 1st 2nd 3rd 4th
forvalues y = 1/4 {
	local lOrdinal : word `y' of `lOrdinals'
	label variable vAfterTreatment`y' "Treatment group, post-treatment `lOrdinal' year"
	foreach lStem in vLengthMinusTreatment vAverUsers vAverEditDays vAverEditDist {
		label variable `lStem'Yr`y' "`lOrdinal' year"
	}
}

********************************************************************************
** Store the final page-month dataset in the temp folder
compress

save "$temp/pagelength_users_edits", replace

********************************************************************************
** Delete the intermediate files that were only needed for the merge
foreach lFile in randomization treatment editdistance users temp_treatment_added_distance {
	erase "$temp/`lFile'.dta"
}

********************************************************************************
** Close the log and empty memory
log close
clear
